Analysis of six years of frog monitoring data from Barmah-Millewa Forest.
This document is intended as supplementary material to the analysis of acoustic frog data collected between 2018 and 2024 (six survey years). It provides the R code used to process data from the acoustic classifier, extract relevant environmental variables, run a multi-season multi-species occupancy model, and interrogate the results.
Load relevant R packages and set options for modelling.
library(tidyverse)
library(sf)
library(mapview)
library(terra)
library(readr)
library(readxl)
library(cmdstanr)
library(bayesplot)
library(httr)
library(VicmapR)
library(ggridges)
library(kableExtra)
check_cmdstan_toolchain(fix = TRUE, quiet = TRUE)
options(mc.cores=4)
register_knitr_engine(override = FALSE)
Source a range of handy functions for summarising and plotting outputs from a STAN model. These were used in a different project and are stored on GitHub.
# Source useful functions from the deer project github page
req <- GET("https://api.github.com/repos/JustinCally/statewide-deer-analysis/git/trees/main?recursive=1")
stop_for_status(req)
filelist <- unlist(lapply(content(req)$tree, "[", "path"), use.names = F)
function_files <- paste0("https://raw.githubusercontent.com/JustinCally/statewide-deer-analysis/main/",
grep("functions/", filelist, value = TRUE, fixed = TRUE))
sapply(function_files, source, verbose = F)
source("functions/ppc_plots.R")
Below we load the site metadata for the analysis, which includes the locations and deployment dates of the AudioMoths.
site_metadata <- read_excel("data/TLM_Barmah-Millewa_AM_deployment_metadata_all_years.xlsx")
# site_data_habitat <- read_excel("data/site_data/TLM_habitat_transect_data_all_years.xlsx") %>%
# group_by(Site, Date) %>%
# summarise(across(c("Depth", "Emergent macrophyte", "Submerged macrophyte",
# "Floating macrophyte", "CWD", "Bare ground"), mean)) %>%
# mutate(Year = as.integer(as.factor(year(Date))))
#
# site_data_habitat2 <- read_excel("data/site_data/TLM_water_quality_all_years.xlsx") %>%
# group_by(Site, Date) %>%
# summarise(across(c("Waterbody width (m)", "Waterbody length (m)", "Max depth (cm)",
# "Canopy cover (%)", "Conductivity", "pH", "Water temp (°C)",
# "Oxygen (mg/L)", "Oxygen sat (%)", "mmHg", "Turbidity (FNU)",
# "Turbidity (NTU)"), mean)) %>%
# mutate(Year = as.integer(as.factor(year(Date))))
# Read in water management area data
WMAs <- read_excel("data/WMAs/Sites_WMAs.xlsx")
WMA_scores <- read_excel("data/WMAs/WMA_flood_scores.xlsx")
WMA_joined <- left_join(WMAs, WMA_scores) %>%
select(WMA, `2018-19` = `2018`, `2019-20` = `2019`, `2020-21` = `2020`, `2021-22` = `2021`, `2022-23` = `2022`, `2023-24` = `2023`) %>%
pivot_longer(2:7, names_to = "Monitoring year", values_to = "FloodScore") %>%
distinct()
# Convert site metadata to a spatially explicit dataset and format
site_metadata_sf <- site_metadata %>%
left_join(WMA_joined, by = join_by(WMA, `Monitoring year`)) %>%
st_as_sf(coords = c("X", "Y"), crs = 28355) %>%
mutate(`Site name` = case_when(`Site` == "Warri" ~ "Warri Yards",
`Site` == "Hughes Crossing" ~ "Hughs Crossing",
Site == "Little Rushy Swamp" ~ "Little Rushy",
Site == "Reed Beds Swamp" ~ "Reed Bed Swamp",
Site == "Wathours Lagoon" ~ "Wathours",
TRUE ~ `Site`),
Site = paste(`Site name`, AM, sep = "_AM"),
Site_Year = paste(Site, `Monitoring year`, sep = "_"),
Year = factor(`Monitoring year`) %>% as.integer(),
Cluster = factor(`Site name`) %>% as.integer(),
Loc = factor(Site) %>% as.integer(),
No_survey_nights = as.numeric(No_PAM_nts)) %>%
# arrange(`Site`) %>%
filter(!is.na(No_survey_nights) & No_survey_nights > 0) %>%
arrange(Site_Year) %>%
filter(!(Site_Year %in% c("Rookery_AM1_2022-23", "Two Mile Bunyip_AM2_2021-22")))
unique_sites <- site_metadata_sf %>%
group_by(Site) %>%
slice(1) %>%
select(Site)
# get a list of the nights deployed
nights_dep <- list()
for(i in 1:nrow(site_metadata_sf)) {
nights_dep[[i]] <- data.frame(Site = site_metadata_sf$Site[i],
Site_Year = site_metadata_sf$Site_Year[i],
Date = seq.Date(from = as.Date(site_metadata_sf$`Date start`[i]),
by = "1 day",
length.out = site_metadata_sf$No_survey_nights[i]))
}
nights_dep_joined <- bind_rows(nights_dep)
site_hull <- st_convex_hull(unique_sites %>% st_combine()) %>%
st_transform(3577) %>%
st_buffer(100)
hydro_data <- vicmap_query("open-data-platform:hy_water_area_polygon") %>%
filter(BBOX(site_hull)) %>%
collect()
elev_data <- vicmap_query("open-data-platform:el_contour") %>%
filter(BBOX(site_hull)) %>%
collect()
mapview(hydro_data, zcol = "feature_type_code") + mapview(site_metadata_sf)
sf::st_write(site_hull, "data/barmah_hull/barmah_hull.shp")
sf::st_write(site_hull %>% st_transform(4326), "data/barmah_hull/barmah_hull.geojson")
sf::st_write(unique_sites %>% st_transform(3577) %>% st_buffer(100), "data/site_metadata_sf/site_metadata_sf.shp", delete_dsn = TRUE)
sf::st_write(unique_sites %>% st_buffer(10, endCapStyle = "SQUARE") %>% st_transform(4326) %>% st_combine(), "data/barmah_hull/site_locs.geojson", delete_dsn = TRUE)
Read in the hydrological data. These data were obtained through the Digital Earth Australia sandbox (https://app.sandbox.dea.ga.gov.au/); the code used to extract the hydrological data and the CSVs for each site are available on GitHub: https://github.com/JustinCally/dea-barmah
# load in csv files of hydro conditions
csv_files <- paste0("https://raw.githubusercontent.com/JustinCally/dea-barmah/main/output/", unique(site_metadata_sf$Site), ".csv")
csv_files <- stringr::str_replace_all(csv_files, pattern = " ", replacement = "%20")
hydro_site_data <- list()
for(i in 1:length(csv_files)) {
hydro_site_data[[i]] <- readr::read_csv(csv_files[i], show_col_types = F) %>%
mutate(Site = unique(site_metadata_sf$Site)[i])
}
hydro_data_combined <- bind_rows(hydro_site_data)
Given that the data were often captured only every two weeks, we want to obtain daily estimates of hydrological conditions. To do this we interpolate the hydrological data across the time range of the study (2018 to early 2024) by fitting a local polynomial regression with the loess() function.
#hydro interpolation
new_df_raw <- data.frame(date = seq(from = min(hydro_data_combined$date),
to = max(hydro_data_combined$date),
by="day"))
interpolate_data <- list()
for(i in 1:length(csv_files)) {
hd <- hydro_site_data[[i]]
mod_pv <- loess(pv ~ as.numeric(date), data = hd, span = 0.075)
mod_npv <- loess(npv ~ as.numeric(date), data = hd, span = 0.075)
mod_bs <- loess(bs ~ as.numeric(date), data = hd, span = 0.075)
mod_wet <- loess(wet ~ as.numeric(date), data = hd, span = 0.075)
mod_water <- loess(water ~ as.numeric(date), data = hd, span = 0.075)
mod_veg_areas <- loess(veg_areas ~ as.numeric(date), data = hd, span = 0.075)
interpolate_data[[i]] <- new_df_raw %>%
mutate(Site = unique(site_metadata_sf$Site)[i],
pv = predict(mod_pv, newdata = new_df_raw),
npv = predict(mod_npv, newdata = new_df_raw),
bs = predict(mod_bs, newdata = new_df_raw),
wet = predict(mod_wet, newdata = new_df_raw),
water = predict(mod_water, newdata = new_df_raw),
veg_areas = predict(mod_veg_areas, newdata = new_df_raw)) %>%
mutate(across(.cols = c(pv,npv,bs, wet, water, veg_areas),
.fns = ~ case_when(.x > 1 ~ 1,
.x < 0 ~ 0,
TRUE ~ .x)))
}
interpolate_combined <- bind_rows(interpolate_data) %>%
mutate(Date = as.Date(date)) %>%
select(-date)
interpolate_data[[62]] %>%
ggplot() +
geom_point(aes(x = date, y = pv)) +
ylim(c(0,1))
Using National Oceanic and Atmospheric Administration (NOAA) gridded daily precipitation and temperature data, we extract the daily precipitation and minimum temperature values estimated at each site at a broad spatial scale (0.5 x 0.5 degrees).
# load in daily precip
# https://psl.noaa.gov/data/gridded/data.cpc.globalprecip.html
precipitation_daily <- c(terra::rast("data/climate/precip_2018.nc"),
terra::rast("data/climate/precip_2019.nc"),
terra::rast("data/climate/precip_2020.nc"),
terra::rast("data/climate/precip_2021.nc"),
terra::rast("data/climate/precip_2022.nc"),
terra::rast("data/climate/precip_2023.nc"),
terra::rast("data/climate/precip_2024.nc"))
precipitation_daily_df <- bind_cols(site_metadata_sf$Site_Year,
extract(precipitation_daily, vect(site_metadata_sf)))
colnames(precipitation_daily_df) <- c("Site_Year","ID", as.character(seq.Date(from = as.Date("2018-01-01"),
to = as.Date("2024-05-13"), by = "day")))
precipitation_daily_long <- pivot_longer(precipitation_daily_df,
cols = 3:2327, names_to = "Date", values_to = "Precipitation") %>%
select(-ID) %>%
mutate(Date = as.Date(Date)) %>%
right_join(nights_dep_joined, by = join_by(Site_Year, Date))
# load in daily minimum temperature
# https://psl.noaa.gov/data/gridded/data.cpc.globaltemp.html
tmin_daily <- c(terra::rast("data/climate/tmin_2018.nc"),
terra::rast("data/climate/tmin_2019.nc"),
terra::rast("data/climate/tmin_2020.nc"),
terra::rast("data/climate/tmin_2021.nc"),
terra::rast("data/climate/tmin_2022.nc"),
terra::rast("data/climate/tmin_2023.nc"),
terra::rast("data/climate/tmin_2024.nc"))
tmin_daily_df <- bind_cols(site_metadata_sf$Site_Year,
extract(tmin_daily, vect(site_metadata_sf)))
colnames(tmin_daily_df) <- c("Site_Year","ID", as.character(seq.Date(from = as.Date("2018-01-01"),
to = as.Date("2024-05-13"), by = "day")))
tmin_daily_long <- pivot_longer(tmin_daily_df,
cols = 3:2327,
names_to = "Date",
values_to = "tmin") %>%
select(-ID) %>%
mutate(Date = as.Date(Date)) %>%
right_join(nights_dep_joined, by = join_by(Site_Year, Date))
We also include two key flow variables in our analysis:
1. Whether the site was subject to any active regulation
2. When that active regulation occurred via release of water through regulators.
flow_hist_b <- read_excel("data/reg_data/ALL_REG_DATA.xlsx", sheet = 1)
flow_hist_m <- read_excel("data/reg_data/ALL_REG_DATA.xlsx", sheet = 2)
flow_hist <- full_join(flow_hist_b, flow_hist_m) %>%
filter(Date > as.Date("2018-01-01")) %>%
mutate(Days = Date + lubridate::days(2))
reg_site_sizes <- read_excel("data/reg_data/Reg_Site Interactions.xlsx") %>%
mutate(gate_1_total = `No gates Size 1` * `Gate size 1 height` * `Gate size 1 width`,
gate_2_total = `No gates Size 2` * `Gate size 2 height` * `Gate size 2 width`,
gate_3_total = `No gates Size 3` * `Gates Size 3 height` * `Gates Size 3 width`,
gate_4_total = `No gates Size 4` * `Gates Size 4 height` * `Gates Size 4 width`,
gate_5_total = `No gates Size 5` * `Gates Size 5 height` * `Gates Size 5 width`,
gate_avg = rowSums(cbind(gate_1_total, gate_2_total, gate_3_total, gate_4_total, gate_5_total), na.rm = T)/`Total gates`, # per-regulator: total gate area / number of gates
gate_avg = coalesce(gate_avg, 0)) %>%
select(regulator = `Reg and overbank site details`, gate_avg)
reg_to_site <- read_excel("data/reg_data/Reg_Site Interactions.xlsx", sheet = 2) %>%
filter(`Regulation score` == "3")
reg_to_site_long <- reg_to_site %>%
pivot_longer(cols = 9:47, names_to = "regulator", values_to = "count", values_drop_na = T) %>%
select(Site = SITE, water_arrival = `Water arrival (days)`, regulator, count) %>%
mutate(`Site` = case_when(`Site` == "Warri" ~ "Warri Yards",
`Site` == "Hughes Crossing" ~ "Hughs Crossing",
Site == "Little Rushy Swamp" ~ "Little Rushy",
Site == "Reed Beds Swamp" ~ "Reed Bed Swamp",
Site == "Wathours Lagoon" ~ "Wathours",
TRUE ~ `Site`)) %>%
full_join(reg_site_sizes)
flow_hist_long <- flow_hist %>%
pivot_longer(cols = 5:34, names_to = "regulator", values_to = "n_gates") %>%
left_join(reg_to_site_long) %>%
group_by(Date, Site) %>%
summarise(total_gate_size_on = sum(n_gates * gate_avg)) %>%
ungroup()
flow_hist_long_c <- bind_rows(flow_hist_long %>% mutate(Site = paste0(Site, "_AM1")),
flow_hist_long %>% mutate(Site = paste0(Site, "_AM2"))) %>%
mutate(Regulated = TRUE)
Given that some of the preceding code can take a while to run, we save the output as an RDS file. This output contains the daily values at each site for hydrological and climate conditions, as well as the number of days since the start of spring (1 September).
daily_vars <- left_join(precipitation_daily_long,
tmin_daily_long) %>%
mutate(Day = lubridate::yday(Date) - 243, # day 1 = 1 September (non-leap years)
Day = case_when(Day < 1 ~ Day + 365, # wrap dates before 1 September into the preceding spring-year
TRUE ~ Day),
cosDay = cos(2*pi*Day/365),
sinDay = sin(2*pi*Day/365)) %>%
inner_join(interpolate_combined, by = c("Site", "Date")) %>%
distinct()
saveRDS(daily_vars, "data/intermediate/daily_vars.rds")
# read in daily vars
daily_vars <- readRDS("data/intermediate/daily_vars.rds") %>%
left_join(flow_hist_long_c) %>%
mutate(total_gate_size_on = coalesce(total_gate_size_on, 0),
any_gates_on = total_gate_size_on > 0,
Regulated = coalesce(Regulated, FALSE))
site_vars <- daily_vars %>%
group_by(Site_Year) %>%
summarise(across(c("pv", "npv", "bs", "wet", "water", "veg_areas"), mean)) %>%
ungroup() %>%
mutate(watercomb = wet + water) %>%
left_join(site_metadata_sf %>%
st_drop_geometry() %>%
transmute(Site_Year, WMA, FloodScore = factor(FloodScore),
Site_type = factor(Site_type), Water_permanence = factor(Water_permanence)),
by = join_by(Site_Year))
For this analysis we model occupancy and call rate using daily maximum call probability scores. We obtained these scores from the raw model output and processed them in a separate script available in this repository (frog_daily_subset.R). The data come in two chunks (2018-2021 and 2021-2024).
# Subset to site years
site_year <- nights_dep_joined %>% select(Site, Site_Year, Date) %>%
unique() %>%
left_join(site_metadata_sf %>% st_drop_geometry() %>% select(Site_Year, Year, Cluster, Loc), by = join_by(Site_Year))
# List of species we have data for
species_list <- c(`Barking Marsh Frog` = 13059,
`Eastern Sign-bearing Froglet` = 13131,
`Common Froglet` = 13134,
# `Sloane's Froglet` = 13135,
`Common Spadefoot Toad` = 13086,
`Peron's Tree Frog` = 13204,
`Pobblebonk Frog` = 63913,
`Spotted Marsh Frog (race unknown)` = 13063)
nspecies <- length(species_list)
sp_l_col <- paste0("sp_", species_list)
# species_cols <- str_extract_all(last(colnames(site_records)), "[-+]?[0-9]+")[[1]]
site_records_all <- list()
daily_files <- list.files("data/daily_data/OtherYears/", full.names = T)
files_to_read <- list.files("data/daily_data/OtherYears/")
sp_read_order <- stringr::str_remove_all(files_to_read, c("daily_max_|.csv"))
for(i in 1:length(daily_files)) {
site_records_all[[i]] <- read_csv(daily_files[i], show_col_types = F) %>%
select(-1) %>%
mutate(DateTime = as.POSIXct(DateTime, tz = Sys.timezone()),
Date = coalesce(Date, as.Date(DateTime - hours(12)))) %>%
mutate(`Site` = case_when(Site == "Little Rushy Swamp_AM1" ~ "Little Rushy_AM1",
Site == "Little Rushy Swamp_AM2" ~ "Little Rushy_AM2",
TRUE ~ `Site`)) %>%
mutate(Validated_Grp = as.character(species_list[[sp_read_order[i]]]),
Validation_Species = sp_read_order[i],
Validated_Grp_score = !!sym(sp_read_order[i])) %>%
select(FileId, Filename, Site, Date, DateTime, Orig_Start, Orig_End,
Validated_Grp, Validation_Species, Validated_Grp_score) %>%
left_join(site_year) %>%
filter(!is.na(Year))
}
names(site_records_all) <- sp_read_order
#### 2021-24 ####
site_records_2023 <- list()
daily_files <- list.files("data/daily_data/2021-2024/", full.names = T)
files_to_read <- list.files("data/daily_data/2021-2024/")
sp_read_order <- stringr::str_remove_all(files_to_read, c("daily_max_|.csv"))
for(i in 1:length(daily_files)) {
site_records_2023[[i]] <- read_csv(daily_files[i], show_col_types = F) %>%
select(-1) %>%
mutate(Validated_Grp = as.character(species_list[[sp_read_order[i]]]),
Validation_Species = sp_read_order[i],
Validated_Grp_score = !!sym(sp_read_order[i])) %>%
select(FileId, Filename, Site, Date, DateTime, Orig_Start, Orig_End,
Validated_Grp, Validation_Species, Validated_Grp_score) %>%
left_join(site_year) %>%
filter(!is.na(Year))
}
names(site_records_2023) <- sp_read_order
site_records_combined <- list()
for(x in names(species_list)) {
site_records_combined[[x]] <- bind_rows(site_records_all[[x]], site_records_2023[[x]]) %>%
mutate(SnippetID = paste(FileId, Site, Date, Orig_Start, Orig_End, sep = "_"))
}
Up to 500 daily calls for each species were validated, with validations for each species covering probability scores ranging from 0 to 1. We read in these validations and visualise the distributions for 'true positives' and 'false positives'.
# site_validation <- read_excel("data/TLM_Barmah-Millewa_2022-23_validations_ALL.xlsx") %>%
# mutate(Site = case_when(Site == "Warri" ~ "Warri Yards",
# TRUE ~ Site),
# Site = paste(Site, AM, sep = "_AM"))
files_to_read <- list.files("data/validated_calls/OtherYears")
sp_read_order <- stringr::str_remove(files_to_read, "_validation.csv")
validated_calls <- list()
for(i in 1:nspecies) {
validated_calls[[sp_read_order[i]]] <- readr::read_csv(paste0("data/validated_calls/OtherYears/", files_to_read[i]),
show_col_types = F) %>%
mutate(Validated_Grp = as.character(species_list[[sp_read_order[i]]]),
Validation_Species = sp_read_order[i],
Validated_Grp_score = !!sym(sp_read_order[i]),
Date = as.Date(Date, "%d/%m/%Y")) %>%
mutate(Detected = as.integer(stringr::str_detect(Validation_TaxonIDs, Validated_Grp)),
SnippetID = paste(FileId, Site, Date, Orig_Start = Orig_Start_Time, Orig_End = Orig_End_Time, sep = "_")) %>%
select(Validated_Grp, Validation_Species, Validated_Grp_score, SnippetID, Detected)
}
files_to_read_2 <- list.files("data/validated_calls/2021-2024")
sp_read_order_2 <- stringr::str_remove(files_to_read_2, "_validation.csv")
validated_calls_2 <- list()
for(i in 1:nspecies) {
validated_calls_2[[sp_read_order_2[i]]] <- validated_calls[[sp_read_order_2[i]]] %>% bind_rows(readr::read_csv(paste0("data/validated_calls/2021-2024/", files_to_read_2[i]),
show_col_types = F) %>%
mutate(Validated_Grp = as.character(species_list[[sp_read_order_2[i]]]),
Validation_Species = sp_read_order_2[i],
Validated_Grp_score = !!sym(sp_read_order_2[i]),
Date = as.Date(Date, "%d/%m/%Y")) %>%
mutate(Detected = as.integer(stringr::str_detect(Validation_TaxonIDs, Validated_Grp)),
SnippetID = paste(FileId, Site, Date, Orig_Start = Orig_Start_Time, Orig_End = Orig_End_Time, sep = "_")) %>%
select(Validated_Grp, Validation_Species, Validated_Grp_score, SnippetID, Detected))
}
records_validated <- list()
for(x in names(species_list)) {
records_validated[[x]] <- left_join(site_records_combined[[x]], validated_calls_2[[x]]) %>%
mutate(Detected = coalesce(Detected, 2L)) %>%
arrange(Site_Year)
}
validated_calls_combined <- bind_rows(records_validated) %>%
filter(Detected != 2) %>%
rowwise() %>%
mutate(Prob_f = case_when(Validated_Grp_score == 1 ~ 0.999,
Validated_Grp_score == 0 ~ 0.001,
TRUE ~ Validated_Grp_score),
Prob_One_M = qlogis(Prob_f))
validated_calls_combined %>%
mutate(Detected = as.factor(Detected)) %>%
filter(Validation_Species != "Common Spadefoot Toad") %>%
ggplot() +
geom_density(aes(x = Prob_f, fill = Detected, colour = Detected), position = "jitter", alpha = 0.25) +
facet_wrap(~Validation_Species) +
xlab("Classifier-assigned probability") +
delwp_theme()
Figure 1: Distributions of probability scores assigned by the classifier
For the model, we filter out the Common Spadefoot Toad, as we have no validated detections for this species. Below we format the acoustic, site, and nightly data into a format that can be used in the custom STAN model.
# Generate STAN data
sp_out <- c("Common Spadefoot Toad")
# "Pobblebonk Frog",
# "Spotted Marsh Frog (race unknown)")
species_list_cleaned <- species_list[which(!(names(species_list) %in% sp_out))]
records_validated <- records_validated[which(!(names(records_validated) %in% sp_out))]
nspecies_cleaned <- length(species_list_cleaned)
sp_l_col_cleaned <- paste0("sp_", species_list_cleaned)
# joined data
joined_data <- list()
site_data <- list()
# indexes for species
site_names <- unique(records_validated[[1]]$Site_Year)
nsites <- length(unique(records_validated[[1]]$Site_Year))
empty_array <- array(dim = c(nsites, length(species_list_cleaned)))
start_idx_0 <- empty_array
end_idx_0 <- empty_array
start_idx_1 <- empty_array
end_idx_1 <- empty_array
start_idx_2 <- empty_array
end_idx_2 <- empty_array
any_seen <- empty_array
site_idx_start <- empty_array
site_idx_end <- empty_array
call_rate_matrix <- list()
occ_matrix <- list()
joined_data <- readRDS("data/intermediate/joined_data.rds")
for(i in 1:length(species_list_cleaned)) {
### Because we are randomly sampling ####
# joined_data[[i]] <- records_validated[[i]] %>%
# inner_join(daily_vars %>% na.omit()) %>%
# mutate(Prob_One = Validated_Grp_score,
# Site_f = as.integer(factor(Site_Year)),
# Prob_f = case_when(Prob_One == 1 ~ 0.999,
# Prob_One == 0 ~ 0.001,
# TRUE ~ Prob_One),
# Prob_One_M = qlogis(Prob_f),
# weight = case_when(Detected == 2 ~ 0.01,
# TRUE ~ 1)) %>%
# group_by(Site_f) %>%
# mutate(any_seen = case_when(any(Detected == 1) ~ 1,
# TRUE ~ 0)) %>%
# slice_sample(n = 30, weight_by = weight) %>%
# ungroup() %>%
# arrange(Site_f, Detected)
site_data[[i]] <- joined_data[[i]] %>%
mutate(Loc = as.factor(Site) %>% as.integer(),
Cluster = as.factor(str_remove_all(Site, "_AM1|_AM2")) %>% as.integer()) %>%
select(Site_f, Loc, Cluster, Year, Site_Year, any_seen, Site) %>%
distinct() %>%
arrange(Site_f) %>%
left_join(site_vars, by = "Site_Year")
# occupancy model
psi_form <- ~ scale(sqrt(wet)) + scale(pv) + scale(sqrt(water)) + Site_type + Water_permanence # + FloodScore
occ_matrix[[i]] <- model.matrix(psi_form, data = site_data[[i]])
# Daily precipitation matrix
theta_form <- ~ scale(sqrt(Precipitation)) + scale(tmin) + scale(sqrt(wet)) + scale(sqrt(water)) + scale(pv) + Regulated + any_gates_on + Regulated*any_gates_on #+ cosDay + sinDay
call_rate_matrix[[i]] <- model.matrix(theta_form, data = joined_data[[i]])
for(j in 1:nsites) {
start_idx_0[j,i] <- min(which(joined_data[[i]]$Site_f == j & joined_data[[i]]$Detected == 0))
end_idx_0[j,i] <- max(which(joined_data[[i]]$Site_f == j & joined_data[[i]]$Detected == 0))
start_idx_1[j,i] <- min(which(joined_data[[i]]$Site_f == j & joined_data[[i]]$Detected == 1))
end_idx_1[j,i] <- max(which(joined_data[[i]]$Site_f == j & joined_data[[i]]$Detected == 1))
start_idx_2[j,i] <- min(which(joined_data[[i]]$Site_f == j & joined_data[[i]]$Detected == 2))
end_idx_2[j,i] <- max(which(joined_data[[i]]$Site_f == j & joined_data[[i]]$Detected == 2))
site_idx_start[j,i] <- min(which(joined_data[[i]]$Site_f == j))
site_idx_end[j,i] <- max(which(joined_data[[i]]$Site_f == j))
}
start_idx_0[is.infinite(start_idx_0[,i]),i] <- 0
start_idx_1[is.infinite(start_idx_1[,i]),i] <- 0
start_idx_2[is.infinite(start_idx_2[,i]),i] <- 0
end_idx_0[is.infinite(end_idx_0[,i]),i] <- 0
end_idx_1[is.infinite(end_idx_1[,i]),i] <- 0
end_idx_2[is.infinite(end_idx_2[,i]),i] <- 0
any_seen[,i] <- as.integer(start_idx_1[,i] != 0)
}
# saveRDS(joined_data, "data/intermediate/joined_data.rds")
In addition to the validation data, we also have previous site-level validations noting whether any calls were detected at that site in that year. These are included in the model, as they restrict the probability space we need to marginalize over for sites with at least one detection.
# read in updated any seen validations
any_seen_add <- readr::read_csv("data/any_seen/sp_detect_per_site_per_year.csv") %>%
mutate(`Site_AM` = case_when(Site_AM == "Little Rushy Swamp_AM1" ~ "Little Rushy_AM1",
Site_AM == "Little Rushy Swamp_AM2" ~ "Little Rushy_AM2",
Site_AM == "Hughes Crossing_AM1" ~ "Hughs Crossing_AM1",
Site_AM == "Hughes Crossing_AM2" ~ "Hughs Crossing_AM2",
Site_AM == "Reed Beds Swamp_AM1" ~ "Reed Bed Swamp_AM1",
Site_AM == "Reed Beds Swamp_AM2" ~ "Reed Bed Swamp_AM2",
Site_AM == "Wathours Lagoon_AM1" ~ "Wathours_AM1",
Site_AM == "Wathours Lagoon_AM2" ~ "Wathours_AM2",
TRUE ~ `Site_AM`)) %>%
mutate(Site_Year = paste(Site_AM, Year, sep = "_")) %>%
arrange(Site_Year)
setdiff(site_names, any_seen_add$Site_Year) %>% paste(collapse = "\n") %>% cat()
any_seen_clean <- any_seen_add %>% filter(Site_Year %in% site_names) %>%
select(`Barking Marsh Frog`,
`Eastern Sign-bearing Froglet` = `Eastern Sign-Bearing Froglet`,
`Common Froglet`,
`Peron's Tree Frog`,
`Pobblebonk Frog` = Pobblebonk,
`Spotted Marsh Frog (race unknown)` = `Spotted Marsh Frog`) %>%
as.matrix()
any_seen_joined <- any_seen_clean
for(j in 1:ncol(any_seen_clean)) {
for(i in 1:nrow(any_seen_clean))
any_seen_joined[i,j] <- max(any_seen[i,j], any_seen_clean[i,j])
}
We then combine all the data into a list, which will be used by the STAN model.
nfiles <- nrow(joined_data[[1]])
scores <- lapply(joined_data, function(x) pull(x, Prob_One_M))
site_list <- lapply(joined_data, function(x) pull(x, Site_f))
loc_code <- lapply(site_data, function(x) pull(x, Loc))
cluster_code <- lapply(site_data, function(x) pull(x, Cluster))
year_code <- lapply(site_data, function(x) pull(x, Year))
detected <- lapply(joined_data, function(x) pull(x, Detected))
site_code <- lapply(joined_data, function(x) pull(x, Site_f))
nyear <- length(unique(unlist(year_code)))
nloc <- length(unique(unlist(loc_code)))
nclusters <- length(unique(unlist(cluster_code)))
stan_data <- list(nfiles = nfiles,
nsites = nsites,
nspec = nspecies_cleaned,
nyear = nyear,
nloc = nloc,
nclusters = nclusters,
score = array(unlist(scores),dim=c(nfiles,nspecies_cleaned)),
site_code = array(unlist(site_code),dim=c(nfiles,nspecies_cleaned)),
loc_code = array(unlist(loc_code),dim=c(nsites,nspecies_cleaned)),
cluster_code = array(unlist(cluster_code),dim=c(nsites,nspecies_cleaned)),
year_code = array(unlist(year_code),dim=c(nsites,nspecies_cleaned)),
year_counts = unname(apply(array(unlist(year_code),dim=c(nsites,nspecies_cleaned)),2,table)),
year_counts_file = unname(as.integer(table(joined_data[[1]]$Year))),
site_idx_start = site_idx_start,
site_idx_end = site_idx_end,
start_idx_0 = start_idx_0,
end_idx_0 = end_idx_0,
start_idx_1 = start_idx_1,
end_idx_1 = end_idx_1,
start_idx_2 = start_idx_2,
end_idx_2 = end_idx_2,
any_seen = any_seen_joined,
theta_mm = call_rate_matrix,
theta_nc = ncol(call_rate_matrix[[1]]),
psi_mm = occ_matrix,
psi_nc = ncol(occ_matrix[[1]]),
detected = array(unlist(detected),dim=c(nfiles,nspecies_cleaned)))
# stan_data$year_code[stan_data$year_code == 5] <- 4
stan_data_beta <- stan_data
stan_data_beta$score <- array(unlist(lapply(joined_data, function(x) pull(x, Prob_f))),dim=c(nfiles,nspecies_cleaned))
saveRDS(stan_data, "data/stan_data.rds")
Below is the STAN model used in our analysis. It is a multi-species, multi-season model: we model observations as a mixture of validated detections and unvalidated probable detections, using the probability scores assigned by the acoustic classifier as the response variable in a beta regression.
data {
int<lower=0> nfiles; // number of files to process (observation periods)
int<lower=0> nsites; // number of site-year combos
int<lower=0> nspec; // number of species
int<lower=0> nyear; // number of years
int<lower=0> nclusters; // number of clusters (pair of recorders)
int<lower=0> nloc; // number of site locations (independent of years)
array[nfiles, nspec] real score; // classifier probability score for each snippet
array[nfiles, nspec] int detected; // detection label (0 = validated non-detection, 1 = validated detection, 2 = not validated)
array[nfiles, nspec] int site_code; // file to site code
array[nsites, nspec] int loc_code;
array[nsites, nspec] int cluster_code;
array[nsites, nspec] int year_code;
array[nyear, nspec] real year_counts;
real year_counts_file[nyear];
array[nsites, nspec] int site_idx_start;
array[nsites, nspec] int site_idx_end;
int start_idx_0[nsites, nspec];
int end_idx_0[nsites, nspec];
int start_idx_1[nsites, nspec];
int end_idx_1[nsites, nspec];
int start_idx_2[nsites, nspec];
int end_idx_2[nsites, nspec];
int any_seen[nsites, nspec];
int<lower=0> theta_nc;
array[nspec] matrix[nfiles, theta_nc] theta_mm; // model matrix
int<lower=0> psi_nc;
array[nspec] matrix[nsites, psi_nc] psi_mm; // occ model matrix
}
// Parameters accepted by the model.
parameters {
// mu
real mu_int[2];
real sigma_int[2];
// re for mu
array[2, nspec] real mu_raw;
real<lower=0> mu_sd[2];
// re for sigma
array[2, nspec] real sigma_raw;
real<lower=0> sigma_sd[2];
// real alpha[nspec];
// location random effect
array[nloc, nspec] real loc_raw;
real<lower=0> loc_sd[nspec];
// year random effect
array[nyear, nspec] real year_raw;
real<lower=0> year_sd[nspec];
// site random effect
array[nsites, nspec] real site_raw;
real<lower=0> site_sd[nspec];
// files random effect
// array[nfiles, nspec] real file_raw;
// real<lower=0> file_sd[nspec];
// call rate coefficient
array[nspec] vector[theta_nc] beta_theta; //call rate
// occ coef
array[nspec] vector[psi_nc] beta_psi; // occupancy
}
transformed parameters {
array[nsites, nspec] real<lower=0,upper=1> psi;
array[nspec] vector<lower=0,upper=1>[nfiles] theta;
real<lower=0> sigma[2, nspec];
real<lower=0> sigma_var[2, nspec];
// random effects
array[nloc, nspec] real eps_loc;
array[nyear, nspec] real eps_year;
array[nsites, nspec] real eps_site;
array[2, nspec] real<lower=0, upper=1> mu_mean;
array[2, nspec] real<lower=0> mu;
for(j in 1:nspec) {
// mu and sigma beta param
mu_mean[1,j] = inv_logit(mu_int[1] + mu_sd[1] * mu_raw[1,j]);
mu_mean[2,j] = inv_logit(mu_int[2] + mu_sd[2] * mu_raw[2,j]);
// linear model for sigma
sigma_var[1,j] = exp(sigma_int[1] + sigma_sd[1] * sigma_raw[1,j]);
sigma_var[2,j] = exp(sigma_int[2] + sigma_sd[2] * sigma_raw[2,j]);
// beta parameterization
mu[1,j] = mu_mean[1,j] .* sigma_var[1,j];
mu[2,j] = mu_mean[2,j] .* sigma_var[2,j];
sigma[1,j] = (sigma_var[1,j]-mu_mean[1,j] .* sigma_var[1,j]);
sigma[2,j] = (sigma_var[2,j]-mu_mean[2,j] .* sigma_var[2,j]);
// locations
for(l in 1:nloc) {
eps_loc[l,j] = loc_sd[j] * loc_raw[l,j];
}
// years
for(y in 1:nyear) {
eps_year[y,j] = year_sd[j] * year_raw[y,j];
}
for(i in 1:nsites) {
eps_site[i,j] = site_sd[j] * site_raw[i,j];
psi[i,j] = inv_logit(psi_mm[j, i] * beta_psi[j] + eps_loc[loc_code[i,j], j] + eps_year[year_code[i,j], j]); // + eps_site[i,j]);
}
for(k in 1:nfiles) {
theta[j,k] = inv_logit(theta_mm[j,k] * beta_theta[j] + eps_site[site_code[k,j],j]); // + (file_sd[j] * to_vector(file_raw[,j])));
}
}
}
model {
// priors
for(j in 1:nspec) {
// alpha[j] ~ normal(0,2);
// re priors
loc_raw[,j] ~ normal(0,1);
loc_sd[j] ~ normal(0,1);
year_raw[,j] ~ normal(0,1);
year_sd[j] ~ normal(0,1);
site_raw[,j] ~ normal(0,1);
site_sd[j] ~ normal(0,1);
// file_raw[,j] ~ normal(0,1);
// file_sd[j] ~ normal(0,1);
beta_theta[j] ~ normal(0,2);
beta_psi[j] ~ normal(0,2);
}
// mu priors
mu_int[1] ~ normal(0,10);
mu_int[2] ~ normal(0,10);
mu_raw[1,] ~ normal(0,1);
mu_raw[2,] ~ normal(0,1);
mu_sd ~ normal(0,1);
//sigma priors
sigma_int[1] ~ normal(0,3);
sigma_int[2] ~ normal(0,3);
sigma_raw[1,] ~ normal(0,1);
sigma_raw[2,] ~ normal(0,1);
sigma_sd ~ normal(0,1);
for(j in 1:nspec) {
for(i in 1:nsites) {
if(start_idx_0[i,j] != 0) {
if(any_seen[i,j] == 0) {
target += log1m(psi[i,j]) + beta_lpdf(score[start_idx_0[i,j]:end_idx_0[i,j],j] | mu[1, j], sigma[1,j]);
}
target += log(psi[i,j]) + log1m(theta[j, start_idx_0[i,j]:end_idx_0[i,j]])
+ beta_lpdf(score[start_idx_0[i,j]:end_idx_0[i,j],j] | mu[1, j], sigma[1,j]);
}
if(start_idx_1[i,j] != 0) {
target += log(psi[i,j]) + log(theta[j, start_idx_1[i,j]:end_idx_1[i,j]])
+ beta_lpdf(score[start_idx_1[i,j]:end_idx_1[i,j],j] | mu[2, j], sigma[2,j]);
}
if(start_idx_2[i,j] != 0) {
if(any_seen[i,j] == 0) {
target += log1m(psi[i,j])
+ beta_lpdf(score[start_idx_2[i,j]:end_idx_2[i,j],j] | mu[1, j], sigma[1,j]);
}
target += log_sum_exp(log(psi[i,j])
+ log_sum_exp(log1m(theta[j, start_idx_2[i,j]:end_idx_2[i,j]]))
+ beta_lpdf(score[start_idx_2[i,j]:end_idx_2[i,j],j] | mu[1, j], sigma[1,j]),
log(psi[i,j])
+ log_sum_exp(log(theta[j, start_idx_2[i,j]:end_idx_2[i,j]]))
+ beta_lpdf(score[start_idx_2[i,j]:end_idx_2[i,j],j] | mu[2, j], sigma[2,j]));
}
}
}
}
generated quantities {
int<lower=0,upper=1> zm[nsites, nspec];
int<lower=0,upper=1> z[nsites, nspec];
int<lower=0> z_sum[nyear, nspec];
array[nyear, nspec] real z_hat;
int<lower=0> theta_sum[nyear, nspec];
array[nyear, nspec] real theta_hat;
array[nsites, nspec] real theta_site;
array[nsites, nspec] real rel_ab;
array[nyear, nspec] real rel_ab_sum;
array[nyear, nspec] real rel_ab_mean;
real mu_pred[2,nspec];
// real theta_phen [365,nspec];
array[nspec] vector<lower=0,upper=1>[nfiles] det_pred;
array[nfiles, nspec] int<lower=0,upper=1> det_z;
array[nfiles, nspec] real score_pred;
array[nsites] int<lower=0> species_richness;
for(j in 1:nspec) {
zm[,j] = bernoulli_rng(psi[,j]);
mu_pred[, j] = beta_rng(mu[,j], sigma[,j]);
// for(i in 1:365) {
// // last two predictors must be cos and sin days
// theta_phen[i,j] = inv_logit(beta_theta[j,1] + beta_theta[j,theta_nc-1]*cos(2*pi()*i/365) + beta_theta[j,theta_nc]*sin(2*pi()*i/365));
// }
for(y in 1:nyear) z_sum[y,j] = 0;
for(y in 1:nyear) theta_sum[y,j] = 0;
for(y in 1:nyear) rel_ab_sum[y,j] = 0;
for(n in 1:nsites) {
det_pred[j, site_idx_start[n,j]:site_idx_end[n,j]] = psi[n,j] * theta[j,site_idx_start[n,j]:site_idx_end[n,j]];
det_z[site_idx_start[n,j]:site_idx_end[n,j],j] = bernoulli_rng(det_pred[j,site_idx_start[n,j]:site_idx_end[n,j]]);
}
for(k in 1:nfiles) {
if(detected[k,j] != 2) {
det_z[k,j] = detected[k,j];
}
if(det_z[k,j] == 0) {
score_pred[k,j] = beta_rng(mu[1, j], sigma[1, j]);
} else if(det_z[k,j] == 1) {
score_pred[k,j] = beta_rng(mu[2, j], sigma[2, j]);
}
}
for(n in 1:nsites) {
theta_site[n,j] = mean(det_z[site_idx_start[n,j]:site_idx_end[n,j],j]);
theta_sum[year_code[n,j],j] += sum(det_z[site_idx_start[n,j]:site_idx_end[n,j],j]);
z[n,j] = max(zm[n,j], any_seen[n,j]);
rel_ab[n,j] = z[n,j]*theta_site[n,j];
rel_ab_sum[year_code[n,j],j] += rel_ab[n,j];
z_sum[year_code[n,j],j] += z[n,j];
}
}
for(j in 1:nspec) {
for(y in 1:nyear) {
z_hat[y,j] = z_sum[y,j]/year_counts[y,j];
rel_ab_mean[y,j] = rel_ab_sum[y,j]/year_counts[y,j];
theta_hat[y,j] = theta_sum[y,j]/year_counts_file[y];
}
}
// site-level community score
for(n in 1:nsites) {
species_richness[n] = sum(z[n,]);
}
}
# cont_model_ms_norm <- cmdstan_model("stan/continuous_model_ms_my_norm.stan")
cont_model_ms_beta <- cmdstan_model("stan/continuous_model_ms_my_beta.stan")
# cont_model_ms_normal <- cmdstan_model("stan/continuous_model_ms_my_normal.stan")
ni <- 250 #samples
nw <- 250 #warmups
nc <- 8 #chains
# fit_normal <- cont_model_ms_normal$sample(data = stan_data_beta,
# chains = nc,
# parallel_chains = nc,
# show_messages = TRUE,
# save_warmup = FALSE,
# iter_sampling = ni,
# iter_warmup = nw, refresh = 50)
#
# fit_normal$save_object("outputs/fit_normal.rds")
fit_beta <- cont_model_ms_beta$sample(data = stan_data_beta,
chains = nc,
parallel_chains = nc,
show_messages = TRUE,
save_warmup = FALSE,
iter_sampling = ni,
iter_warmup = nw, refresh = 50)
fit_beta$save_object("outputs/fit_beta.rds")
fit_beta <- readRDS("outputs/fit_beta.rds")
Six consecutive years of acoustic data were analysed. Acoustic monitoring began in the austral spring/summer of 2018-2019 and has continued through the most recent 2023-2024 season. Across the six-year monitoring period, a total of 36 sites with paired songmeters (n = 72) were surveyed. However, not every songmeter-site was surveyed in every year, and in some cases equipment failures yielded no survey data, giving a total of 234 songmeter-site-years across the six years. Songmeters were deployed for varying lengths of time, with an average of 124 nights and a maximum of 241 nights. Acoustic recordings were processed using an automated species-classification recogniser.
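As a quick sanity check, the deployment figures quoted above can be recomputed from the metadata loaded earlier; a minimal sketch, assuming site_metadata_sf and its columns (Site_Year, No_survey_nights) as created at the start of this document:
# Recompute the deployment summary from the site metadata
site_metadata_sf %>%
  st_drop_geometry() %>%
  summarise(n_songmeter_site_years = n_distinct(Site_Year),
            mean_nights = mean(No_survey_nights, na.rm = TRUE),
            max_nights = max(No_survey_nights, na.rm = TRUE))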
The acoustic classifier produced 11,633,143 snippets of possible frog detections for the following species: Barking Marsh Frog, Eastern Sign-bearing Froglet, Common Froglet, Common Spadefoot Toad, Peron's Tree Frog, Pobblebonk Frog, and Spotted Marsh Frog (race unknown). Each snippet was encoded with a vector of 'probability scores', one per species, describing the probabilistic assignment of that call snippet to each species. For this analysis, we subset the more than 11 million records to the snippet with the daily maximum probability score for each species, leaving 29,213 possible call snippets per species across all site-year combinations. For each species, between 338 and 441 call snippets were selected for validation using stratified random sampling. To ensure a range of probability scores was validated for each species, the stratification balanced sampling across five probability-score strata (0-0.2, 0.2-0.4, 0.4-0.6, 0.6-0.8 and 0.8-1).
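A hedged sketch of this stratified sampling (data and names simulated here, not the project's sampling script): snippets are binned into the five probability-score strata and sampled evenly within each.
# Stratified sampling of snippets across probability-score strata (illustrative)
set.seed(1)
snips <- data.frame(score = runif(5000))
snips$stratum <- cut(snips$score, breaks = seq(0, 1, by = 0.2), include.lowest = TRUE)
validation_sample <- snips %>%
  group_by(stratum) %>%
  slice_sample(n = 100) %>%  # balanced sample per stratum
  ungroup()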
The final data include a mix of validated and unvalidated snippets for each species and each songmeter-site across the survey years. Each snippet has a 'probability score' between 0 and 1 as well as a validation label (0 = validated: species not heard; 1 = validated: species heard; 2 = not validated).
We collected and compiled data that may vary spatially and temporally. These data can be used to model either (i) occupancy or (ii) call rate/activity. Data associated with occupancy relate to conditions over the span of that year's survey (e.g., several months), whereas data associated with call rate were collected at a daily/nightly frequency, as they seek to explain variation in calling activity over the length of a deployment within an occupied site.
Hydrology is expected to be a key driver of frog occupancy and activity within Barmah-Millewa. In this analysis we used remotely sensed estimates of water, wetness and photosynthetic vegetation ("Product Catalogue - Geoscience Australia," n.d.), supported by Digital Earth Australia's Wetlands Insight Tool (WIT). Using Landsat 5, 7, and 8, we estimated fractional cover of freestanding water, wetness, photosynthetic vegetation, dry vegetation and bare soil. For each songmeter-site we obtained time-series data across the six survey years within a 100 m buffer surrounding the songmeter. Landsat measurements occurred approximately every 15 days; to estimate the fractional cover metrics at a daily level, we interpolated between measurements with local polynomial regression, using the loess() R function (Cleveland, Grosse, and Shyu 2017) to enable daily predictions of fractional vegetation and water cover.
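As an illustration of this step, the sketch below fits loess() with the same span (0.075) to synthetic fortnightly 'water' fractions and predicts daily values; the data here are simulated, and the actual interpolation loop appears earlier in this document.
# Illustrative loess interpolation of fortnightly fractional cover to daily values
set.seed(1)
obs <- data.frame(date = seq(as.Date("2018-01-01"), as.Date("2024-01-01"), by = "15 days"))
obs$water <- pmin(pmax(0.5 + 0.4*sin(2*pi*as.numeric(obs$date)/365) + rnorm(nrow(obs), 0, 0.05), 0), 1)
mod <- loess(water ~ as.numeric(date), data = obs, span = 0.075)
daily <- data.frame(date = seq(min(obs$date), max(obs$date), by = "day"))
daily$water <- predict(mod, newdata = daily)  # interpolated daily estimates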
Active regulation data were obtained via a time series of regulator usage, together with data linking each regulator to the sites affected by its usage. Active regulation data were available on a daily basis, with a uniform 2-day lag applied between regulator operation (ON/OFF) and any modelled effect on sites.
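This lag corresponds to the Days column created in flow_hist above; a minimal sketch with illustrative dates:
# Apply a uniform 2-day lag to regulator operation dates (dates illustrative)
reg_dates <- as.Date(c("2020-10-01", "2020-10-02"))
lagged_dates <- reg_dates + lubridate::days(2)  # modelled effect at sites 2 days later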
Daily temperature and precipitation are anticipated to affect frog calling activity. To obtain estimates of daily minimum temperature and precipitation we used National Oceanic and Atmospheric Administration (NOAA) gridded daily precipitation and temperature data (National Oceanic and Atmospheric Administration, Physical Sciences Laboratory 2022). Daily precipitation and minimum temperature values were estimated at each site at a broad spatial scale (0.5 x 0.5 degrees); thus the variation in temperature and precipitation is largely temporal rather than spatial.
Barmah-Millewa is a dynamic ecological system: habitat suitability for the analysed frog species is anticipated to vary according to hydrological conditions, which vary spatially and temporally. While multi-year occupancy data are often analysed using models that estimate local colonisation and extinction rates at each site (MacKenzie et al. 2003), this becomes more difficult when data are sparse (i.e., only 5 of 36 sites were surveyed consistently across the six survey years): transition parameters (colonisation and extinction) are harder to estimate when sites are not surveyed consistently. Given the sparsity of the data, and that site characteristics can vary substantially between years, we opted for a 'stacked' multiseason model. A 'stacked' multiseason model effectively models site-year combinations independently (as different sites), accounting for a site's non-independence across years with a site-level random effect (Kellner et al. 2022). This approach may be particularly well suited to our data, as the primary drivers of occupancy and call activity (hydrological conditions and climate) are temporally variable.
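A minimal sketch of this 'stacked' structure (site and year names illustrative): each site-year combination becomes its own row/'site' in the model, while the original site identifier is kept as a grouping factor for the random effect.
# 'Stacking' site-years: each Site_Year is treated as a site,
# with Site retained for the site-level random effect
stacked <- expand.grid(Site = c("SiteA", "SiteB"), Year = 2018:2020) %>%
  mutate(Site_Year = paste(Site, Year, sep = "_"),
         Loc = as.integer(factor(Site)))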
Unlike traditional occupancy models, which account for imperfect detection using repeated binary detection histories, we modelled occupancy and call activity using a continuous score bounded between 0 and 1 (the 'probability scores'), implementing the methods of Rhinehart, Turek, and Kitzes (2022). Occupancy at a songmeter-site for each species in a given survey year was modelled as:
\[z_{s,i} \sim Bernoulli(\psi_{s,i})\] where \(\psi_{s,i}\) is the probability of occupancy at that songmeter-site-year, modelled as a logit-link linear model:
\[logit(\psi_{s,i}) = \alpha_{s} + \beta_s \chi_i + \epsilon_{j(s,i)} + \zeta_{y(s,i)} \] Here, \(\alpha_s\) is the intercept for species \(s\), and \(\beta_s \chi_i\) is the contribution of the fixed-effect covariates. Random effects for songmeter site (\(j\)) and year (\(y\)) are denoted by \(\epsilon_{j(s,i)}\) and \(\zeta_{y(s,i)}\), respectively.
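On the response scale, \(\psi\) is the inverse logit of this linear predictor; a toy calculation with placeholder values:
# Toy evaluation of psi for one species x site-year (all values illustrative)
alpha_s <- -0.5                   # species intercept
beta_s  <- c(0.8, -0.3)           # fixed-effect coefficients
x_i     <- c(1.2, 0.4)            # scaled site-year covariates
eps_j   <- 0.2                    # songmeter-site random effect
zeta_y  <- -0.1                   # year random effect
psi_si  <- plogis(alpha_s + sum(beta_s * x_i) + eps_j + zeta_y)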
The call rate is modelled through a Bernoulli model:
\[f_{k(s,i)} \sim Bernoulli(z_{s,i} \theta_{k(s,i)})\] where \(f_{k(s,i)}\) denotes the detection state of audio snippet \(k\) at site \(i\). The true occupancy at a site is denoted by \(z_{s,i}\); when \(z_{s,i} = 0\), the species will not be present in any files. If a species is present at a site (\(z_{s,i} = 1\)), then the frequency with which the species' call appears in files is given by the call-rate parameter (\(\theta_{k(s,i)}\)). We model this parameter as a logit-link linear model:
\[logit(\theta_{k(s,i)}) = \delta_{s} + \eta_s \chi_{k(s,i)} + \xi_{k(s,i)} \] Here, \(\delta_{s}\) is the intercept, \(\eta_s \chi_{k(s,i)}\) is the contribution of the fixed effects for daily covariates that might affect calling rate, and \(\xi_{k(s,i)}\) is a random effect for each observation period.
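To make the data-generating process concrete, the sketch below simulates nightly detection states under this model using placeholder parameter values:
# Simulating the occupancy/call-rate process (values illustrative)
set.seed(1)
psi   <- 0.6                        # occupancy probability
theta <- 0.3                        # nightly call probability given presence
z     <- rbinom(1, 1, psi)          # latent occupancy state
f     <- rbinom(100, 1, z * theta)  # detection states for 100 nightly snippets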
In Rhinehart, Turek, and Kitzes (2022), a logit-transformed normal distribution is used for the 'probability scores'. Here, however, we opt to use a beta distribution to model classifier performance and relate the 'probability scores' to observations or non-observations. To model this process we parameterise the beta distribution with a random-effects linear model, where the random effect is the species-specific contribution to the performance of the acoustic classifier. This allows us to model the performance of the acoustic classifier differently for each species while still sharing information on classifier performance between species, which may be particularly informative where a species has few validated detections and classifier performance is more difficult to estimate within the model. Classifier performance, or how probability scores from the acoustic classifier relate to validated detections and non-detections, is modelled as:
\[p_{k(s,i)} \mid f_{k(s,i)} = 0 \sim Beta(\alpha_{1,s}, \beta_{1,s}) \] \[p_{k(s,i)} \mid f_{k(s,i)} = 1 \sim Beta(\alpha_{2,s}, \beta_{2,s}) \] with separate models for cases where the probability score (\(p_{k(s,i)}\)) is validated as a non-detection (\(f_{k(s,i)} = 0\)) or as a detection (\(f_{k(s,i)} = 1\)). The parameters determining the shape of each distribution (\(\alpha\) and \(\beta\)) are modelled using a mean component (\(inv\_logit(\mu + \gamma_s)\)) and a precision component (\(exp(\sigma + \kappa_s)\)):
\[\alpha = inv\_logit(\mu + \gamma_s) \times exp(\sigma + \kappa_s) \] \[\beta = exp(\sigma + \kappa_s) - inv\_logit(\mu + \gamma_s) \times exp(\sigma + \kappa_s) \] Here \(\mu\) is the intercept for the mean and \(\sigma\) is the intercept for the precision. Species-level random effects for the mean and precision models are given by \(\gamma_s\) and \(\kappa_s\), respectively.
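A short numerical check of this mean-precision parameterisation (values illustrative): with \(\alpha = m\phi\) and \(\beta = \phi - m\phi\), the resulting beta distribution has mean \(m\).
# Mean-precision parameterisation of the beta distribution (illustrative values)
m     <- plogis(0.5)                # inv_logit(mu + gamma_s)
phi   <- exp(1.2)                   # exp(sigma + kappa_s)
alpha <- m * phi
beta  <- phi - m * phi
mean(rbeta(1e5, alpha, beta))       # approximately equal to m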
The likelihood functions for each possible combination of file annotations and site-level validated occupancy can be found in Table 1 of Rhinehart, Turek, and Kitzes (2022) and in our STAN code presented above.
Models were written in the STAN programming language and run for 250 warmup and 250 sampling iterations across 8 parallel chains. Model convergence and diagnostics were assessed using posterior predictive checks and traceplots, as shown below.
Posterior predictive checks can be used to compare model predictions to the original data supplied to the model. Here we compare summary statistics for the probability scores supplied to the model; ideally, predictions from the model align with the supplied data. We compare the 25%, 50% and 75% quantiles of the 'probability scores'.
q95 <- function(x) quantile(x, 0.95, na.rm = T)
q25 <- function(x) quantile(x, 0.25, na.rm = T)
q50 <- function(x) quantile(x, 0.5, na.rm = T)
q75 <- function(x) quantile(x, 0.75, na.rm = T)
sd_90 <- function (x, na.rm = FALSE) {
quants <- quantile(x, c(0.05, 0.95), na.rm = T)
x <- x[x < quants[2] & x > quants[1]]
sqrt(var(if (is.vector(x) || is.factor(x)) x else as.double(x),
na.rm = na.rm))
}
species_list_cleaned2 <- species_list_cleaned
names(species_list_cleaned2)[6] <- "Spotted Marsh Frog"
ppc_plots_25 <- list()
ppc_plots_50 <- list()
ppc_plots_75 <- list()
for(i in 1:length(species_list_cleaned)) {
ppc_plots_25[[i]] <- posterior_checks_multispecies(model = fit_beta,
model_data = joined_data,
species_index = i,
stat = "q25",
title = paste0("25% quantile ", names(species_list_cleaned2)[i]))
ppc_plots_50[[i]] <- posterior_checks_multispecies(model = fit_beta,
model_data = joined_data,
species_index = i,
stat = "q50",
title = paste0("50% quantile ", names(species_list_cleaned2)[i]))
ppc_plots_75[[i]] <- posterior_checks_multispecies(model = fit_beta,
model_data = joined_data,
species_index = i,
stat = "q75",
title = paste0("75% quantile ", names(species_list_cleaned2)[i]))
}
ppc_grid <- cowplot::plot_grid(plotlist = c(ppc_plots_25, ppc_plots_50, ppc_plots_75), nrow = 6, byrow = F)
ggsave(plot = ppc_grid, filename = "outputs/plots/ppcs_normal.pdf", width = 6000, height = 8000, units = "px")
ppc_grid
Figure 2: Posterior predictive checks for the 25%, 50%, and 75% quantiles of backpredicted classifier probability scores against the observed quantiles from validated data
To check model convergence we can visualise traceplots. Good chain mixing and convergence are evidenced by traceplots in which the chains overlay one another and do not diverge.
params_to_trace <- c("mu_sd",
"sigma_sd",
"beta_psi",
"beta_theta")
traces <- mcmc_trace(fit_beta$draws(params_to_trace))
traces